{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OPIK: Existing Opik clients will not use updated values for \"url\", \"api_key\", \"workspace\".\n",
      "OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/akshay/.opik.config\n"
     ]
    }
   ],
   "source": [
    "import opik\n",
    "# use_local=False: configure the SDK for a non-local Opik deployment.\n",
    "opik.configure(use_local=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "# Load variables from a .env file into the environment for the cells below.\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup Workflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from workflow import RAGWorkflow\n",
    "import asyncio\n",
    "\n",
    "def load_workflow(model_option):\n",
    "    \"\"\"Return a RAGWorkflow for the requested model option.\n",
    "\n",
    "    \"Gemma3\" selects the gemma3 model; every other value selects llama3.2.\n",
    "    \"\"\"\n",
    "    selected_model = \"gemma3\" if model_option == \"Gemma3\" else \"llama3.2\"\n",
    "    return RAGWorkflow(model_name=selected_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = 'Gemma3'\n",
    "# NOTE: load_workflow only special-cases 'Gemma3'; any other value\n",
    "# (e.g. 'DeepSeek-R1') falls back to the llama3.2 model.\n",
    "workflow  = load_workflow(model_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Trace RAG calls "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import Settings\n",
    "from llama_index.core.callbacks import CallbackManager\n",
    "from opik.integrations.llama_index import LlamaIndexCallbackHandler\n",
    "\n",
    "# A callback handler to automatically log all LlamaIndex operations to Opik\n",
    "opik_callback_handler = LlamaIndexCallbackHandler()\n",
    "\n",
    "# Register the handler globally through LlamaIndex's Settings object\n",
    "Settings.callback_manager = CallbackManager([opik_callback_handler])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OPIK: Started logging traces to the \"Default Project\" project at https://www.comet.com/opik/akshayp/redirect/projects?name=Default%20Project.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x33e383c80>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ingest the documents under ./eval-data/paul_graham; the call's return value\n",
    "# (a VectorStoreIndex in the recorded run) is displayed as the cell output.\n",
    "await workflow.ingest_documents(\"./eval-data/paul_graham\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Paul is an entrepreneur who made Y Combinatore (YC), a startup accelerator that funds new startups. He was a programmer in his adolescence and later decided to study philosophy in college before realizing his passion for AI. In 2010, Robert Morris’s advice led him to realize he needed to hand over YC and he began painting as a new activity. He ultimately gave up painting after losing interest in the project.\n"
     ]
    }
   ],
   "source": [
    "# Smoke-test the pipeline with a single question before running the evaluation.\n",
    "response = await workflow.query(\"Who is Paul Graham?\")\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik import Opik\n",
    "\n",
    "# Create an Opik client and fetch the dataset named \"Test dataset\",\n",
    "# creating it if it does not exist yet.\n",
    "client = Opik()\n",
    "dataset = client.get_or_create_dataset(name=\"Test dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Question</th>\n",
       "      <th>Answer</th>\n",
       "      <th>Context</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What was the very first programming language P...</td>\n",
       "      <td>He used an early version of Fortran on the IBM...</td>\n",
       "      <td>The language we used was an early version of F...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Which microcomputer did Paul Graham’s father f...</td>\n",
       "      <td>A TRS-80.</td>\n",
       "      <td>Computers were expensive in those days and it ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What was the name of the startup Paul Graham c...</td>\n",
       "      <td>Viaweb.</td>\n",
       "      <td>We started a new company we called Viaweb, aft...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Which friend of Paul Graham was the person res...</td>\n",
       "      <td>Robert Tappan Morris (often referred to as “Ro...</td>\n",
       "      <td>I remember when my friend Robert Morris got ki...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What was the title of the second Lisp book tha...</td>\n",
       "      <td>*ANSI Common Lisp.*</td>\n",
       "      <td>So with my unerring nose for financial opportu...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Question  \\\n",
       "0  What was the very first programming language P...   \n",
       "1  Which microcomputer did Paul Graham’s father f...   \n",
       "2  What was the name of the startup Paul Graham c...   \n",
       "3  Which friend of Paul Graham was the person res...   \n",
       "4  What was the title of the second Lisp book tha...   \n",
       "\n",
       "                                              Answer  \\\n",
       "0  He used an early version of Fortran on the IBM...   \n",
       "1                                          A TRS-80.   \n",
       "2                                            Viaweb.   \n",
       "3  Robert Tappan Morris (often referred to as “Ro...   \n",
       "4                                *ANSI Common Lisp.*   \n",
       "\n",
       "                                             Context  \n",
       "0  The language we used was an early version of F...  \n",
       "1  Computers were expensive in those days and it ...  \n",
       "2  We started a new company we called Viaweb, aft...  \n",
       "3  I remember when my friend Robert Morris got ki...  \n",
       "4  So with my unerring nose for financial opportu...  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the evaluation questions, reference answers and reference contexts.\n",
    "df = pd.read_csv(\"./eval-data/test.csv\")\n",
    "\n",
    "# Fail fast if the CSV lacks a column that the dataset-building cell reads.\n",
    "required_columns = {\"Question\", \"Answer\", \"Context\"}\n",
    "missing_columns = required_columns - set(df.columns)\n",
    "assert not missing_columns, f\"test.csv is missing columns: {sorted(missing_columns)}\"\n",
    "\n",
    "# Preview the first rows so the loaded data is visible in the notebook.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input': 'What was the very first programming language Paul Graham used when he began learning to program on the IBM 1401?',\n",
       " 'expected_output': 'He used an early version of Fortran on the IBM 1401.',\n",
       " 'context': 'The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it.'}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert every CSV row into a dataset item with the keys\n",
    "# \"input\", \"expected_output\" and \"context\".\n",
    "qa_pairs = [\n",
    "    {\"input\": question, \"expected_output\": answer, \"context\": context}\n",
    "    for question, answer, context in zip(df[\"Question\"], df[\"Answer\"], df[\"Context\"])\n",
    "]\n",
    "\n",
    "# Show the first item as a sanity check.\n",
    "qa_pairs[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the Q&A items to the Opik dataset.\n",
    "# NOTE(review): re-running this cell submits the same items again - confirm\n",
    "# that Opik de-duplicates them, otherwise the dataset may grow on each run.\n",
    "dataset.insert(qa_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import contextvars\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "\n",
    "from opik import track\n",
    "\n",
    "\n",
    "@track\n",
    "async def my_llm_application(input: str) -> str:\n",
    "    \"\"\"Answer one question with the RAG workflow and return the answer as text.\"\"\"\n",
    "    response = await workflow.query(input)\n",
    "    return str(response)\n",
    "\n",
    "\n",
    "def _run_coroutine_blocking(coro):\n",
    "    \"\"\"Run `coro` to completion from synchronous code and return its result.\"\"\"\n",
    "    try:\n",
    "        asyncio.get_running_loop()\n",
    "    except RuntimeError:\n",
    "        # No event loop is running in this thread, so asyncio.run() is safe.\n",
    "        return asyncio.run(coro)\n",
    "    # An event loop is already running in this thread (e.g. the notebook kernel),\n",
    "    # where asyncio.run() raises. Run the coroutine on a fresh loop in a helper\n",
    "    # thread instead, carrying over the calling thread's context variables.\n",
    "    caller_context = contextvars.copy_context()\n",
    "    with ThreadPoolExecutor(max_workers=1) as executor:\n",
    "        return executor.submit(caller_context.run, asyncio.run, coro).result()\n",
    "\n",
    "\n",
    "def evaluation_task(x):\n",
    "    \"\"\"Synchronous evaluation task: answer one dataset item.\n",
    "\n",
    "    Args:\n",
    "        x: Dataset item; its \"input\" entry holds the question text.\n",
    "\n",
    "    Returns:\n",
    "        A dict whose \"output\" entry is the answer string. The async application\n",
    "        is driven to completion here; calling it without awaiting would place a\n",
    "        coroutine object in \"output\" instead of the answer.\n",
    "    \"\"\"\n",
    "    answer = _run_coroutine_blocking(my_llm_application(x['input']))\n",
    "    return {\"output\": answer}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik.evaluation.metrics import (\n",
    "    Hallucination,\n",
    "    AnswerRelevance,\n",
    "    ContextPrecision,\n",
    "    ContextRecall\n",
    ")\n",
    "\n",
    "# Instantiate each scoring metric with its default settings (no arguments passed).\n",
    "hallucination_metric = Hallucination()\n",
    "answer_relevance_metric = AnswerRelevance()\n",
    "context_precision_metric = ContextPrecision()\n",
    "context_recall_metric = ContextRecall() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluation: 100%|██████████| 5/5 [00:15<00:00,  3.09s/it]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─ Test dataset (5 samples) ─────────────╮\n",
       "│                                        │\n",
       "│ <span style=\"font-weight: bold\">Total time:       </span> 00:00:16            │\n",
       "│ <span style=\"font-weight: bold\">Number of samples:</span> 5                   │\n",
       "│                                        │\n",
       "│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">hallucination_metric: 0.8000 (avg)</span>     │\n",
       "│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">answer_relevance_metric: 0.2500 (avg)</span>  │\n",
       "│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">context_precision_metric: 0.0000 (avg)</span> │\n",
       "│ <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">context_recall_metric: 0.0000 (avg)</span>    │\n",
       "│                                        │\n",
       "╰────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭─ Test dataset (5 samples) ─────────────╮\n",
       "│                                        │\n",
       "│ \u001b[1mTotal time:       \u001b[0m 00:00:16            │\n",
       "│ \u001b[1mNumber of samples:\u001b[0m 5                   │\n",
       "│                                        │\n",
       "│ \u001b[1;32mhallucination_metric: 0.8000 (avg)\u001b[0m     │\n",
       "│ \u001b[1;32manswer_relevance_metric: 0.2500 (avg)\u001b[0m  │\n",
       "│ \u001b[1;32mcontext_precision_metric: 0.0000 (avg)\u001b[0m │\n",
       "│ \u001b[1;32mcontext_recall_metric: 0.0000 (avg)\u001b[0m    │\n",
       "│                                        │\n",
       "╰────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploading results to Opik <span style=\"color: #808000; text-decoration-color: #808000\">...</span> \n",
       "</pre>\n"
      ],
      "text/plain": [
       "Uploading results to Opik \u001b[33m...\u001b[0m \n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">View the results <a href=\"https://www.comet.com/opik/akshayp/experiments/0195b48e-0ec4-73a9-adaf-36e16c7ea3e8/compare?experiments=%5B%220195b4bb-b6fa-79e4-8fe5-18e2a27a61b8%22%5D\" target=\"_blank\">in your Opik dashboard</a>.\n",
       "</pre>\n"
      ],
      "text/plain": [
       "View the results \u001b]8;id=855507;https://www.comet.com/opik/akshayp/experiments/0195b48e-0ec4-73a9-adaf-36e16c7ea3e8/compare?experiments=%5B%220195b4bb-b6fa-79e4-8fe5-18e2a27a61b8%22%5D\u001b\\in your Opik dashboard\u001b]8;;\u001b\\.\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from opik.evaluation import evaluate\n",
    "\n",
    "# Run evaluation_task over the dataset, score the results with the four\n",
    "# metrics, and record the run as an experiment named after the selected model.\n",
    "evaluation = evaluate(\n",
    "    dataset=dataset,\n",
    "    task=evaluation_task,\n",
    "    experiment_name=model_name,\n",
    "    scoring_metrics=[\n",
    "        hallucination_metric,\n",
    "        answer_relevance_metric,\n",
    "        context_precision_metric,\n",
    "        context_recall_metric,\n",
    "    ],\n",
    "    experiment_config={\n",
    "        # Record the model option the workflow was built with, not a\n",
    "        # placeholder name, so experiments stay comparable across models.\n",
    "        \"model\": model_name\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_llamaindex",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
